/*
 * Copyright (C) 2015 Edward Raff
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.jstarcraft.ai.jsat.io;

import static java.lang.Character.isWhitespace;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.jstarcraft.ai.jsat.DataSet;
import com.jstarcraft.ai.jsat.DataStore;
import com.jstarcraft.ai.jsat.SimpleDataSet;
import com.jstarcraft.ai.jsat.classifiers.CategoricalData;
import com.jstarcraft.ai.jsat.classifiers.ClassificationDataSet;
import com.jstarcraft.ai.jsat.classifiers.DataPoint;
import com.jstarcraft.ai.jsat.linear.DenseVector;
import com.jstarcraft.ai.jsat.linear.Vec;
import com.jstarcraft.ai.jsat.regression.RegressionDataSet;
import com.jstarcraft.ai.jsat.utils.StringUtils;

import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.fastutil.ints.IntArrayList;

/**
 * Provides a reader and writer for CSV style datasets. This CSV reader supports
 * comments in CSVs (must begin with a single character) and categorical
 * features (columns must be specified when calling). Any number of newlines
 * will be treated as a single newline separating two rows.<br>
 * <br>
 * When reading and writing a CSV, if the delimiter or comment markers are not
 * specified - the defaults will be used {@link #DEFAULT_DELIMITER} and
 * {@link #DEFAULT_COMMENT} respectively.<br>
 * <br>
 * The CSV loader will treat empty columns as missing values for both numeric
 * and categorical features. A value of "NaN" in a numeric column will also be
 * treated as a missing value. Once loaded, missing values for numeric features
 * are encoded as {@link Double#NaN} and as <i>-1</i> for categorical features.
 *
 * @author Edward Raff
 */
public class CSV {
    /** Column delimiter used by the overloads that do not take an explicit delimiter. */
    public static final char DEFAULT_DELIMITER = ',';
    /** Comment marker used by the overloads that do not take an explicit comment character. */
    public static final char DEFAULT_COMMENT = '#';

    /**
     * Private so the class cannot be instantiated from outside; the API visible
     * in this file is exposed through static methods.
     */
    private CSV() {
    }

    /**
     * Reads in a CSV dataset as a regression dataset.
     *
     * @param numeric_target_column the column index (starting from zero) of the
     *                              feature that will be the target regression value
     * @param path                  the CSV file to read
     * @param lines_to_skip         the number of lines to skip when reading in the
     *                              CSV (used to skip header information)
     * @param cat_cols              a set of the indices to treat as categorical
     *                              features.
     * @return the regression dataset from the given CSV file
     * @throws IOException
     */
    public static RegressionDataSet readR(int numeric_target_column, Path path, int lines_to_skip, Set<Integer> cat_cols) throws IOException {
        // fill in the library defaults and hand off to the fully specified overload
        final char delimiter = DEFAULT_DELIMITER;
        final char comment = DEFAULT_COMMENT;
        return readR(numeric_target_column, path, delimiter, lines_to_skip, comment, cat_cols);
    }

    /**
     * Reads in a CSV dataset as a regression dataset.
     *
     * @param numeric_target_column the column index (starting from zero) of the
     *                              feature that will be the target regression value
     * @param reader                the reader for the CSV content
     * @param lines_to_skip         the number of lines to skip when reading in the
     *                              CSV (used to skip header information)
     * @param cat_cols              a set of the indices to treat as categorical
     *                              features.
     * @return the regression dataset from the given CSV file
     * @throws IOException
     */
    public static RegressionDataSet readR(int numeric_target_column, Reader reader, int lines_to_skip, Set<Integer> cat_cols) throws IOException {
        // fill in the library defaults and hand off to the fully specified overload
        final char delimiter = DEFAULT_DELIMITER;
        final char comment = DEFAULT_COMMENT;
        return readR(numeric_target_column, reader, delimiter, lines_to_skip, comment, cat_cols);
    }

    /**
     * Reads in a CSV dataset as a regression dataset.
     *
     * @param numeric_target_column the column index (starting from zero) of the
     *                              feature that will be the target regression value
     * @param path                  the CSV file to read
     * @param delimiter             the delimiter to separate columns, usually a
     *                              comma
     * @param lines_to_skip         the number of lines to skip when reading in the
     *                              CSV (used to skip header information)
     * @param comment               the character used to indicate the start of a
     *                              comment. Once this character is reached,
     *                              anything at and after the character will be
     *                              ignored.
     * @param cat_cols              a set of the indices to treat as categorical
     *                              features.
     * @return the regression dataset from the given CSV file
     * @throws IOException
     */
    public static RegressionDataSet readR(int numeric_target_column, Path path, char delimiter, int lines_to_skip, char comment, Set<Integer> cat_cols) throws IOException {
        // try-with-resources guarantees the file handle is released even when
        // parsing fails part way through (e.g. on a malformed row)
        try (BufferedReader br = Files.newBufferedReader(path, Charset.defaultCharset())) {
            return readR(numeric_target_column, br, delimiter, lines_to_skip, comment, cat_cols);
        }
    }

    /**
     * Reads in a CSV dataset as a regression dataset.
     *
     * @param numeric_target_column the column index (starting from zero) of the
     *                              feature that will be the target regression value
     * @param reader                the reader for the CSV content
     * @param delimiter             the delimiter to separate columns, usually a
     *                              comma
     * @param lines_to_skip         the number of lines to skip when reading in the
     *                              CSV (used to skip header information)
     * @param comment               the character used to indicate the start of a
     *                              comment. Once this character is reached,
     *                              anything at and after the character will be
     *                              ignored.
     * @param cat_cols              a set of the indices to treat as categorical
     *                              features.
     * @return the regression dataset from the given CSV file
     * @throws IOException
     */
    public static RegressionDataSet readR(int numeric_target_column, Reader reader, char delimiter, int lines_to_skip, char comment, Set<Integer> cat_cols) throws IOException {
        // a numeric target and no categorical target (-1) makes the shared parser
        // produce a regression dataset
        final int noClassTarget = -1;
        DataSet<?> parsed = readCSV(reader, lines_to_skip, delimiter, comment, cat_cols, numeric_target_column, noClassTarget);
        return (RegressionDataSet) parsed;
    }

    /**
     * Reads in a CSV dataset as a classification dataset. Comments assumed to start
     * with the "#" symbol.
     * 
     * @param classification_target the column index (starting from zero) of the
     *                              feature that will be the categorical target
     *                              value
     * @param path                  the CSV file to read
     * @param lines_to_skip         the number of lines to skip when reading in the
     *                              CSV (used to skip header information)
     * @param cat_cols              a set of the indices to treat as categorical
     *                              features.
     * @return the classification dataset from the given CSV file
     * @throws IOException
     */
    public static ClassificationDataSet readC(int classification_target, Path path, int lines_to_skip, Set<Integer> cat_cols) throws IOException {
        // fill in the library defaults and hand off to the fully specified overload
        final char delimiter = DEFAULT_DELIMITER;
        final char comment = DEFAULT_COMMENT;
        return readC(classification_target, path, delimiter, lines_to_skip, comment, cat_cols);
    }

    /**
     * Reads in a CSV dataset as a classification dataset. Comments assumed to start
     * with the "#" symbol.
     * 
     * @param classification_target the column index (starting from zero) of the
     *                              feature that will be the categorical target
     *                              value
     * @param reader                the reader for the CSV content
     * @param lines_to_skip         the number of lines to skip when reading in the
     *                              CSV (used to skip header information)
     * @param cat_cols              a set of the indices to treat as categorical
     *                              features.
     * @return the classification dataset from the given CSV file
     * @throws IOException
     */
    public static ClassificationDataSet readC(int classification_target, Reader reader, int lines_to_skip, Set<Integer> cat_cols) throws IOException {
        // fill in the library defaults and hand off to the fully specified overload
        final char delimiter = DEFAULT_DELIMITER;
        final char comment = DEFAULT_COMMENT;
        return readC(classification_target, reader, delimiter, lines_to_skip, comment, cat_cols);
    }

    /**
     * Reads in a CSV dataset as a classification dataset.
     * 
     * @param classification_target the column index (starting from zero) of the
     *                              feature that will be the categorical target
     *                              value
     * @param reader                the reader for the CSV content
     * @param delimiter             the delimiter to separate columns, usually a
     *                              comma
     * @param lines_to_skip         the number of lines to skip when reading in the
     *                              CSV (used to skip header information)
     * @param comment               the character used to indicate the start of a
     *                              comment. Once this character is reached,
     *                              anything at and after the character will be
     *                              ignored.
     * @param cat_cols              a set of the indices to treat as categorical
     *                              features.
     * @return the classification dataset from the given CSV file
     * @throws IOException
     */
    public static ClassificationDataSet readC(int classification_target, Reader reader, char delimiter, int lines_to_skip, char comment, Set<Integer> cat_cols) throws IOException {
        // a categorical target and no numeric target (-1) makes the shared parser
        // produce a classification dataset
        final int noNumericTarget = -1;
        DataSet<?> parsed = readCSV(reader, lines_to_skip, delimiter, comment, cat_cols, noNumericTarget, classification_target);
        return (ClassificationDataSet) parsed;
    }

    /**
     * Reads in a CSV dataset as a classification dataset.
     * 
     * @param classification_target the column index (starting from zero) of the
     *                              feature that will be the categorical target
     *                              value
     * @param path                  the CSV file
     * @param delimiter             the delimiter to separate columns, usually a
     *                              comma
     * @param lines_to_skip         the number of lines to skip when reading in the
     *                              CSV (used to skip header information)
     * @param comment               the character used to indicate the start of a
     *                              comment. Once this character is reached,
     *                              anything at and after the character will be
     *                              ignored.
     * @param cat_cols              a set of the indices to treat as categorical
     *                              features.
     * @return the classification dataset from the given CSV file
     * @throws IOException
     */
    public static ClassificationDataSet readC(int classification_target, Path path, char delimiter, int lines_to_skip, char comment, Set<Integer> cat_cols) throws IOException {
        // try-with-resources guarantees the file handle is released even when
        // parsing fails part way through (e.g. on a malformed row)
        try (BufferedReader br = Files.newBufferedReader(path, Charset.defaultCharset())) {
            return readC(classification_target, br, delimiter, lines_to_skip, comment, cat_cols);
        }
    }

    /**
     * Reads in the given CSV dataset as a simple CSV file
     * 
     * @param path          the CSV file
     * @param lines_to_skip the number of lines to skip when reading in the CSV
     *                      (used to skip header information)
     * @param cat_cols      a set of the indices to treat as categorical features.
     * @return a simple dataset of the given CSV file
     * @throws IOException
     */
    public static SimpleDataSet read(Path path, int lines_to_skip, Set<Integer> cat_cols) throws IOException {
        // fill in the library defaults and hand off to the fully specified overload
        final char delimiter = DEFAULT_DELIMITER;
        final char comment = DEFAULT_COMMENT;
        return read(path, delimiter, lines_to_skip, comment, cat_cols);
    }

    /**
     * Reads in the given CSV dataset as a simple CSV file
     * 
     * @param reader        the reader for the CSV content
     * @param lines_to_skip the number of lines to skip when reading in the CSV
     *                      (used to skip header information)
     * @param cat_cols      a set of the indices to treat as categorical features.
     * @return a simple dataset of the given CSV file
     * @throws IOException
     */
    public static SimpleDataSet read(Reader reader, int lines_to_skip, Set<Integer> cat_cols) throws IOException {
        // fill in the library defaults and hand off to the fully specified overload
        final char delimiter = DEFAULT_DELIMITER;
        final char comment = DEFAULT_COMMENT;
        return read(reader, delimiter, lines_to_skip, comment, cat_cols);
    }

    /**
     * Reads in the given CSV dataset as a simple CSV file
     * 
     * @param path          the CSV file to read
     * @param delimiter     the delimiter to separate columns, usually a comma
     * @param lines_to_skip the number of lines to skip when reading in the CSV
     *                      (used to skip header information)
     * @param comment       the character used to indicate the start of a comment.
     *                      Once this character is reached, anything at and after
     *                      the character will be ignored.
     * @param cat_cols      a set of the indices to treat as categorical features.
     * @return a simple dataset of the given CSV file
     * @throws IOException
     */
    public static SimpleDataSet read(Path path, char delimiter, int lines_to_skip, char comment, Set<Integer> cat_cols) throws IOException {
        // try-with-resources guarantees the file handle is released even when
        // parsing fails part way through (e.g. on a malformed row)
        try (BufferedReader br = Files.newBufferedReader(path, Charset.defaultCharset())) {
            return read(br, delimiter, lines_to_skip, comment, cat_cols);
        }
    }

    /**
     * Reads in the given CSV dataset as a simple CSV file
     * 
     * @param reader        the reader for the CSV content
     * @param delimiter     the delimiter to separate columns, usually a comma
     * @param lines_to_skip the number of lines to skip when reading in the CSV
     *                      (used to skip header information)
     * @param comment       the character used to indicate the start of a comment.
     *                      Once this character is reached, anything at and after
     *                      the character will be ignored.
     * @param cat_cols      a set of the indices to treat as categorical features.
     * @return a simple dataset of the given CSV file
     * @throws IOException
     */
    public static SimpleDataSet read(Reader reader, char delimiter, int lines_to_skip, char comment, Set<Integer> cat_cols) throws IOException {
        // with neither a numeric nor a categorical target (-1 for both) the shared
        // parser produces a plain dataset
        final int noTarget = -1;
        DataSet<?> parsed = readCSV(reader, lines_to_skip, delimiter, comment, cat_cols, noTarget, noTarget);
        return (SimpleDataSet) parsed;
    }

    /**
     * Parses CSV content from the given reader with a character-level state
     * machine and builds the matching dataset type.<br>
     * <br>
     * Columns listed in {@code cat_col}, as well as the {@code cat_target} column,
     * are read as categorical values; every other column is parsed as a number.
     * Empty values are stored as {@link Double#NaN} for numeric columns and as
     * <i>-1</i> for categorical features. Categorical option indices are assigned
     * from the sorted order of the option strings, so they do not depend on the
     * order in which the options appear in the input.<br>
     * <br>
     * A line that contains nothing but a comment (optionally preceded by
     * whitespace) is ignored and does not produce a row.
     *
     * @param reader         the source of the CSV characters. It is not closed by
     *                       this method.
     * @param lines_to_skip  the number of leading lines to discard
     * @param delimiter      the character separating columns
     * @param comment        the character that starts a comment, which runs to the
     *                       end of the line
     * @param cat_col        the CSV column indices to treat as categorical features
     * @param numeric_target the CSV column index of the regression target, or a
     *                       negative value if there is none
     * @param cat_target     the CSV column index of the classification target, or a
     *                       negative value if there is none
     * @return a {@link ClassificationDataSet} if {@code cat_target >= 0}, otherwise
     *         a {@link RegressionDataSet} if {@code numeric_target >= 0}, otherwise
     *         a {@link SimpleDataSet}
     * @throws IOException      if reading from {@code reader} fails
     * @throws RuntimeException if rows have differing numbers of columns, if the
     *                          classification target is missing for a row, or if
     *                          the input ends in an unexpected parser state
     */
    private static DataSet<?> readCSV(Reader reader, int lines_to_skip, char delimiter, char comment, Set<Integer> cat_col, int numeric_target, int cat_target) throws IOException {
        // the characters of the value currently being read
        StringBuilder processBuffer = new StringBuilder(20);
        // characters pulled from the reader but not yet consumed; indexed by position
        StringBuilder charBuffer = new StringBuilder(1024);
        char[] read_buffer = new char[1024];

        // The target values if doing regression
        DoubleArrayList regressionTargets = new DoubleArrayList();
        // The target values if doing classification
        IntArrayList catTargets = new IntArrayList();

        // First mapping is for each column that contains categorical variables. The
        // value map is a mapping from each string to its index, based on order seen.
        Map<Integer, Map<String, Integer>> seenCats = new HashMap<>();
        for (int col : cat_col)
            if (col != cat_target)
                seenCats.put(col, new HashMap<>());
        // a mapping from each string to its index, based on order seen, for the
        // target class
        Map<String, Integer> seenCats_target = new HashMap<>();

        // maps the position of a categorical feature within a row's categorical
        // array to the CSV column it was read from
        Map<Integer, Integer> cat_indx_to_csv_column = new HashMap<>();

        STATE state = STATE.INITIAL;
        int position = 0;

        // Negative value used to indicate that we don't know how many columns there
        // are yet. Once we process a single row, we set the number of columns seen
        // so we can sanity check
        int totalCols = -1;
        DoubleArrayList numericFeats = new DoubleArrayList();
        IntArrayList catFeats = new IntArrayList();
        int cur_column = 0;

        List<Vec> all_vecs = new ArrayList<>();
        List<int[]> all_cats = new ArrayList<>();

        while (true) {

            while (charBuffer.length() - position <= 1)// make sure we have chars to handle
            {
                // move everything to the front
                charBuffer.delete(0, position);
                position = 0;

                int read = reader.read(read_buffer);
                if (read < 0)
                    break;
                charBuffer.append(read_buffer, 0, read);
            }

            if (charBuffer.length() - position == 0)// EOF, no more chars
            {
                // Look at the last state we were in before EOF
                if (state == STATE.NEWLINE) {
                    // nothing to do and everything already processed, just return
                    break;
                } else if (state == STATE.COMMENT) {
                    // nothing to do, values were already added when we transitioned
                    // to the comment state
                    break;
                } else if (state == STATE.VALUE)// line ended in the middle of processing
                {
                    charBuffer.append("\n");// append the wanted newline and let it run through like normal
                } else
                    throw new RuntimeException("Unexpected end of CSV input while in state " + state);

            }

            // Normal processing of states
            char ch = charBuffer.charAt(position);
            switch (state) {
            case INITIAL:
                if (lines_to_skip > 0)
                    state = STATE.SKIPPING_ROWS;
                else
                    state = STATE.VALUE;
                break;
            case COMMENT:// comment behaves basically the same as SKIPPING ROWS
            case SKIPPING_ROWS:
                if (isNewLine(ch)) {
                    if (state == STATE.SKIPPING_ROWS)
                        lines_to_skip--;
                    state = STATE.NEWLINE;
                } else {
                    // keep moving till we hit a new line
                    position++;
                }
                break;
            case VALUE:

                if (ch == comment && cur_column == 0 && processBuffer.length() == 0) {
                    // The line holds nothing but a comment (leading whitespace is
                    // never buffered). Skip it instead of recording a one-column row
                    // with a missing value, which would corrupt the data or trip the
                    // column-count check.
                    state = STATE.COMMENT;
                } else if (ch == delimiter || isNewLine(ch) || ch == comment) {
                    // trim all the white space from the end of what we have been reading
                    while (processBuffer.length() > 0 && isWhitespace(processBuffer.charAt(processBuffer.length() - 1)))
                        processBuffer.setLength(processBuffer.length() - 1);

                    // clean up the value we are looking at
                    if (cat_col.contains(cur_column) || cur_column == cat_target) {
                        Map<String, Integer> map = (cur_column == cat_target) ? seenCats_target : seenCats.get(cur_column);
                        String cat_op = processBuffer.toString();
                        processBuffer.setLength(0);

                        int val;
                        if (cat_op.length() == 0)
                            val = -1;// empty means missing
                        else {
                            // first time an option is seen it gets the next free index
                            if (!map.containsKey(cat_op))
                                map.put(cat_op, map.size());
                            val = map.get(cat_op);
                        }

                        if (cur_column == cat_target)
                            if (val == -1)
                                throw new RuntimeException("Categorical column can't have missing values!");
                            else
                                catTargets.add(val);
                        else
                            catFeats.add(val);

                        if (cur_column != cat_target)
                            cat_indx_to_csv_column.put(catFeats.size() - 1, cur_column);
                    } else// numeric feature
                    {
                        double val;
                        if (processBuffer.length() == 0)
                            val = Double.NaN;// empty means missing
                        else
                            val = StringUtils.parseDouble(processBuffer, 0, processBuffer.length());
                        processBuffer.setLength(0);
                        if (cur_column == numeric_target) {
                            regressionTargets.add(val);
                        } else// normal storage
                        {
                            numericFeats.add(val);
                        }
                    }

                    // now do the state transitions. Note the terminating character
                    // is not consumed here, the next state takes care of it
                    if (ch == delimiter)
                        state = STATE.DELIMITER;
                    else {
                        if (ch == comment)
                            state = STATE.COMMENT;
                        else
                            state = STATE.NEWLINE;

                        // the row is complete, check its width against the first row
                        if (totalCols < 0)
                            totalCols = cur_column + 1;
                        else if (totalCols != cur_column + 1)
                            throw new RuntimeException("Inconsistent number of columns in CSV");

                        // add our stuff to the list
                        all_vecs.add(new DenseVector(numericFeats));
                        int[] cat_vals = new int[catFeats.size()];
                        for (int i = 0; i < cat_vals.length; i++)
                            cat_vals[i] = catFeats.getInt(i);
                        all_cats.add(cat_vals);

                        numericFeats.clear();
                        catFeats.clear();
                    }
                } else// process a character value
                {
                    if (processBuffer.length() == 0 && Character.isWhitespace(ch)) {
                        // don't add leading whitespace to the buffer, just move to next char
                        position++;
                    } else// normal value, add to buffer and increment to next char
                    {
                        processBuffer.append(ch);
                        position++;
                    }
                }

                break;
            case DELIMITER:

                if (ch == delimiter) {
                    // consume the delimiter and start on the next column
                    position++;
                    cur_column++;
                    state = STATE.VALUE;
                } else
                    throw new RuntimeException("BAD CSV");// how did we get here?

                break;
            case NEWLINE:
                cur_column = 0;
                if (isNewLine(ch))
                    position++;// swallow any run of newline characters
                else// now we move to next state
                {
                    if (lines_to_skip > 0) {
                        // keep skipping until we are out of lines to skip
                        state = STATE.SKIPPING_ROWS;
                    } else {
                        state = STATE.VALUE;
                    }
                }
                break;
            }
        }

        // ok, we read everything in - clean up time on the categorical features

        // we will sort each set of seen options so that we get the same feature
        // index ordering regardless of the order they occurred in the data. For each
        // CSV column this maps the order-seen index to the sorted index
        Map<Integer, Map<Integer, Integer>> cat_true_index = new HashMap<>();

        Map<Integer, CategoricalData> catDataMap = new HashMap<>();
        if (cat_target >= 0)// added so it gets processed easily below
            seenCats.put(cat_target, seenCats_target);
        CategoricalData target_data = null;
        for (Map.Entry<Integer, Map<String, Integer>> main_entry : seenCats.entrySet()) {
            HashMap<Integer, Integer> translator = new HashMap<>();
            int col = main_entry.getKey();
            Map<String, Integer> catsSeen = main_entry.getValue();
            List<String> sortedOrder = new ArrayList<>(catsSeen.keySet());
            Collections.sort(sortedOrder);

            CategoricalData cd = new CategoricalData(sortedOrder.size());
            if (col != cat_target)
                catDataMap.put(col, cd);
            else
                target_data = cd;
            for (int i = 0; i < sortedOrder.size(); i++) {
                translator.put(catsSeen.get(sortedOrder.get(i)), i);
                cd.setOptionName(sortedOrder.get(i), i);
            }

            cat_true_index.put(col, translator);
        }

        // go through and convert everything
        for (int[] cat_vals : all_cats) {
            for (int i = 0; i < cat_vals.length; i++) {
                if (cat_vals[i] >= 0)// if -1 its a missing value
                    cat_vals[i] = cat_true_index.get(cat_indx_to_csv_column.get(i)).get(cat_vals[i]);
            }
        }

        if (cat_target >= 0)// clean up the target value as well
        {
            Map<Integer, Integer> translator = cat_true_index.get(cat_target);
            for (int i = 0; i < catTargets.size(); i++)
                catTargets.set(i, translator.get(catTargets.get(i)));
        }

        // collect the categorical variable headers
        CategoricalData[] cat_array = new CategoricalData[catDataMap.size()];
        for (int i = 0; i < cat_array.length; i++)
            cat_array[i] = catDataMap.get(cat_indx_to_csv_column.get(i));

        if (cat_target >= 0) {
            // numeric feature count = all columns - categorical features - the target
            ClassificationDataSet d = new ClassificationDataSet(totalCols - cat_array.length - 1, cat_array, target_data);
            d.setDataStore(DataStore.DEFAULT_STORE.emptyClone());
            for (int i = 0; i < all_vecs.size(); i++)
                d.addDataPoint(all_vecs.get(i), all_cats.get(i), catTargets.getInt(i));

            return d;
        } else if (numeric_target >= 0) {
            // numeric feature count = all columns - categorical features - the target
            RegressionDataSet d = new RegressionDataSet(totalCols - cat_array.length - 1, cat_array);
            d.setDataStore(DataStore.DEFAULT_STORE.emptyClone());
            for (int i = 0; i < all_vecs.size(); i++)
                d.addDataPoint(all_vecs.get(i), all_cats.get(i), regressionTargets.getDouble(i));

            return d;
        } else {
            SimpleDataSet d = new SimpleDataSet(totalCols - cat_array.length, cat_array);
            d.setDataStore(DataStore.DEFAULT_STORE.emptyClone());
            for (int i = 0; i < all_vecs.size(); i++)
                d.add(new DataPoint(all_vecs.get(i), all_cats.get(i), cat_array));

            return d;
        }

    }

    /**
     * Writes out the given dataset as a CSV file. If the given dataset is a
     * regression or classification dataset, the target feature that is being
     * predicted will always be written out as the first index in the CSV. <br>
     * After that, all numeric features will be written out in order, followed by
     * the categorical features.
     *
     * @param data the dataset object to save as a CSV file
     * @param path the path to write the CSV to
     * @throws IOException
     */
    public static void write(DataSet<?> data, Path path) throws IOException {
        // no delimiter given, so fall back to the library default
        final char delimiter = DEFAULT_DELIMITER;
        write(data, path, delimiter);
    }

    /**
     * Writes out the given dataset as a CSV file. If the given dataset is a
     * regression or classification dataset, the target feature that is being
     * predicted will always be written out as the first index in the CSV. <br>
     * After that, all numeric features will be written out in order, followed by
     * the categorical features.
     *
     * @param data   the dataset object to save as a CSV file
     * @param writer the output writer to write the CSV to
     * @throws IOException
     */
    public static void write(DataSet<?> data, Writer writer) throws IOException {
        // no delimiter given, so fall back to the library default
        final char delimiter = DEFAULT_DELIMITER;
        write(data, writer, delimiter);
    }

    /**
     * Writes out the given dataset as a CSV file. If the given dataset is a
     * regression or classification dataset, the target feature that is being
     * predicted will always be written out as the first index in the CSV. <br>
     * After that, all numeric features will be written out in order, followed by
     * the categorical features.
     *
     * @param data      the dataset object to save as a CSV file
     * @param path      the path to write the CSV to
     * @param delimiter the delimiter between column values, normally a comma
     * @throws IOException
     */
    public static void write(DataSet<?> data, Path path, char delimiter) throws IOException {
        // try-with-resources guarantees the file handle is released even when
        // writing fails part way through
        try (BufferedWriter bw = Files.newBufferedWriter(path, Charset.defaultCharset())) {
            write(data, bw, delimiter);
        }
    }

    /**
     * Writes out the given dataset as a CSV file. If the given dataset is a
     * regression or classification dataset, the target feature that is being
     * predicted will always be written out as the first index in the CSV. <br>
     * After that, all numeric features will be written out in order, followed by
     * the categorical features.
     *
     * @param data      the dataset object to save as a CSV file
     * @param writer    the output writer to write the CSV to
     * @param delimiter the delimiter between column values, normally a comma
     * @throws IOException
     */
    public static void write(DataSet<?> data, Writer writer, char delimiter) throws IOException {
        // first, create safe categorical feature names to write out
        String[][] catNamesToUse = getSafeNames(data.getCategories(), delimiter);
        String[] classNames = null;
        if (data instanceof ClassificationDataSet)
            classNames = getSafeNames(new CategoricalData[] { ((ClassificationDataSet) data).getPredicting() }, delimiter)[0];

        // (double) Long.MAX_VALUE rounds up to exactly 2^63. An integral double whose
        // magnitude is below this bound converts to long without loss; anything at or
        // beyond it (including the infinities) would saturate when cast.
        final double longLimit = (double) Long.MAX_VALUE;

        // write out every data point
        for (int i = 0; i < data.size(); i++) {
            if (i > 0)// write newline first
                writer.write('\n');
            boolean nothingWrittenYet = true;

            // target feature always goes at the front
            if (data instanceof ClassificationDataSet) {
                int targetClass = ((ClassificationDataSet) data).getDataPointCategory(i);
                writer.write(classNames[targetClass]);
                nothingWrittenYet = false;
            } else if (data instanceof RegressionDataSet) {
                double targetVal = ((RegressionDataSet) data).getTargetValue(i);
                writer.write(Double.toString(targetVal));
                nothingWrittenYet = false;
            }

            DataPoint dp = data.getDataPoint(i);
            Vec v = dp.getNumericalValues();
            int[] c = dp.getCategoricalValues();

            // write out numeric features first
            for (int j = 0; j < v.length(); j++) {
                if (!nothingWrittenYet)
                    writer.write(delimiter);

                // below handles NaN correctly, rint will just return NaN, the
                // comparison fails and toString prints "NaN"
                double val = v.get(j);
                // write whole numbers as a long to save space, but only when the
                // value actually fits in a long. Otherwise the cast would silently
                // clamp it to Long.MAX_VALUE / Long.MIN_VALUE and corrupt the data
                if (Math.rint(val) == val && Math.abs(val) < longLimit)
                    writer.write(Long.toString((long) val));
                else
                    writer.write(Double.toString(val));
                nothingWrittenYet = false;
            }
            // then categorical features, using the safe names we constructed earlier
            for (int j = 0; j < c.length; j++) {
                if (!nothingWrittenYet)
                    writer.write(delimiter);
                if (c[j] >= 0)
                    writer.write(catNamesToUse[j][c[j]]);
                // else, its negative - which is missing, so not writing anything out should
                // result in the correct behavior
                nothingWrittenYet = false;
            }
        }

        writer.flush();
    }

    /**
     * Convenience overload that builds a streaming CSV {@link DataWriter} using
     * {@link #DEFAULT_DELIMITER} as the separator between column values. Arbitrary
     * data points can then be streamed through the returned object into the given
     * output stream. This works in a thread safe manner.
     *
     * @param out        the location to store all the data
     * @param catInfo    information about the categorical features to be written
     * @param dim        information on how many numeric features exist
     * @param predicting information on the class label, may be {@code null} if not
     *                   a classification dataset
     * @param type       what type of data set (simple, classification, regression)
     *                   to be written
     * @return the DataWriter that the actual points can be streamed through
     * @throws IOException declared for parity with the delimiter-taking overload
     *                     that this method delegates to
     */
    static public DataWriter getWriter(OutputStream out, CategoricalData[] catInfo, int dim, CategoricalData predicting, DataWriter.DataSetType type) throws IOException {
        // everything except the delimiter is forwarded untouched
        final char delimiter = DEFAULT_DELIMITER;
        return getWriter(out, catInfo, dim, predicting, type, delimiter);
    }

    /**
     * Returns a DataWriter object which can be used to stream a set of arbitrary
     * datapoints into the given output stream. This works in a thread safe manner.
     * <br>
     * Each point is written as one line terminated by {@code '\n'}: the target
     * value first (for classification and regression types), then the numeric
     * features, then the categorical features. No header is written. Numeric values
     * that are whole numbers and fit in a {@code long} are written without a
     * fractional part; all other values (including {@code NaN}, the infinities and
     * very large magnitudes) are written using {@link Double#toString(double)}.
     *
     * @param out        the location to store all the data
     * @param catInfo    information about the categorical features to be written
     * @param dim        information on how many numeric features exist
     * @param predicting information on the class label, may be {@code null} if not
     *                   a classification dataset
     * @param type       what type of data set (simple, classification, regression)
     *                   to be written
     * @param delimiter  the character delimiter between features
     * @return the DataWriter that the actual points can be streamed through
     * @throws IOException      declared by this factory; see {@link DataWriter}
     * @throws RuntimeException if {@code type} is classification but
     *                          {@code predicting} is {@code null}
     */
    static public DataWriter getWriter(OutputStream out, CategoricalData[] catInfo, int dim, CategoricalData predicting, DataWriter.DataSetType type, final char delimiter) throws IOException {
        // first, create safe categorical feature names to write out
        final String[][] catNamesToUse = getSafeNames(catInfo, delimiter);
        final String[] classNames;
        if (DataWriter.DataSetType.CLASSIFICATION == type) {
            if (predicting == null)
                throw new RuntimeException("Can't create CSV writer without prediction target information (was null) ");
            classNames = getSafeNames(new CategoricalData[] { predicting }, delimiter)[0];
        } else
            classNames = null;

        DataWriter dw = new DataWriter(out, catInfo, dim, type) {
            @Override
            protected void writeHeader(CategoricalData[] catInfo, int dim, DataWriter.DataSetType type, OutputStream out) {
                // CSV format has no header!
            }

            @Override
            protected void pointToBytes(double weight, DataPoint dp, double label, ByteArrayOutputStream byteOut) {
                // note: the weight is not part of the CSV output and is ignored here
                PrintWriter writer = new PrintWriter(byteOut);
                boolean nothingWrittenYet = true;

                // target feature always goes at the front
                if (type == DataWriter.DataSetType.CLASSIFICATION) {
                    int targetClass = (int) label;
                    writer.write(classNames[targetClass]);
                    nothingWrittenYet = false;
                } else if (type == DataWriter.DataSetType.REGRESSION) {
                    double targetVal = label;
                    writer.write(Double.toString(targetVal));
                    nothingWrittenYet = false;
                }

                Vec v = dp.getNumericalValues();
                int[] c = dp.getCategoricalValues();

                // write out numeric features first
                for (int j = 0; j < v.length(); j++) {
                    if (!nothingWrittenYet)
                        writer.write(delimiter);

                    double val = v.get(j);
                    // Whole numbers are written as a long to save space, but only when the
                    // magnitude is below 2^63: casting an infinity or a larger value to long
                    // saturates at Long.MAX_VALUE / Long.MIN_VALUE and would silently corrupt
                    // the data. NaN fails the rint comparison and is printed as "NaN".
                    if (Math.rint(val) == val && Math.abs(val) < (double) Long.MAX_VALUE)
                        writer.write(Long.toString((long) val));
                    else
                        writer.write(Double.toString(val));
                    nothingWrittenYet = false;
                }
                // then categorical features, using the safe names we constructed earlier
                for (int j = 0; j < c.length; j++) {
                    if (!nothingWrittenYet)
                        writer.write(delimiter);
                    if (c[j] >= 0)
                        writer.write(catNamesToUse[j][c[j]]);
                    // else, it is negative - which is missing, so not writing anything out
                    // leaves an empty column
                    nothingWrittenYet = false;
                }
                writer.write("\n");
                // push the characters buffered by the PrintWriter into byteOut
                writer.flush();
            }
        };
        return dw;
    }

    /**
     * Builds, for every categorical feature, the option names that will be written
     * to the CSV. Each name is trimmed and every occurrence of the delimiter inside
     * it is replaced, so that a name can not be mistaken for two columns.
     *
     * @param cats      the categorical features to create names for
     * @param delimiter the column delimiter that must not appear in any name
     * @return an array indexed by feature, then by option index, holding the name
     *         to write out
     */
    private static String[][] getSafeNames(CategoricalData[] cats, char delimiter) {
        // the substitute character must never be the delimiter itself
        final char substitute = (delimiter == '_') ? '-' : '_';

        final String[][] safeNames = new String[cats.length][];
        for (int feature = 0; feature < cats.length; feature++) {
            final CategoricalData cat = cats[feature];
            final String[] options = new String[cat.getNumOfCategories()];
            for (int opt = 0; opt < options.length; opt++) {
                // replace(char, char) hands back the same string when there is nothing to
                // replace, so no separate contains check is needed
                options[opt] = cat.getOptionName(opt).trim().replace(delimiter, substitute);
            }
            safeNames[feature] = options;
        }
        return safeNames;
    }

    /**
     * Tests whether the given character is a line terminator character.
     *
     * @param ch the character to test
     * @return {@code true} if the character is a line feed or a carriage return,
     *         {@code false} otherwise
     */
    private static boolean isNewLine(char ch) {
        switch (ch) {
        case '\n':
        case '\r':
            return true;
        default:
            return false;
        }
    }

    /**
     * The states of the simple state machine used to parse CSV files. The
     * declaration order of the constants is kept as is.
     */
    private enum STATE {
        /**
         * Initial state, doesn't actually do anything
         */
        INITIAL,
        /**
         * Used when we start and want to skip some fixed number of rows in the file
         */
        SKIPPING_ROWS,
        /**
         * NOTE(review): presumably active while the characters of a column value are
         * being read - confirm against the parser
         */
        VALUE,
        /**
         * NOTE(review): presumably entered when the column delimiter is encountered -
         * confirm against the parser
         */
        DELIMITER,
        /**
         * NOTE(review): presumably entered when the end of a row is encountered -
         * confirm against the parser
         */
        NEWLINE,
        /**
         * When we encounter the comment start character, run till we hit the end of the
         * line
         */
        COMMENT
    }
}
